## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'plotly'
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## 
## The following object is masked from 'package:graphics':
## 
##     layout

import NHANES datafiles

Imported demographic, HIV test, and sexual behavior dataframes 1999 - 2016.

demo_list = 
  list.files(path = "NHANES/DEMO", full.names = TRUE) 
hiv_list = 
  list.files(path = "NHANES/HIV", full.names = TRUE) 
sxq_list = 
  list.files(path = "NHANES/SXQ", full.names = TRUE) 


load_files = function(x){
  
  filetype =
    str_extract(x, "DEMO|HIV|SXQ")
  
  year = 
    str_extract(x, "(?<=_)[A-Z]")
  
data = 
    read_xpt(x) |>
    janitor::clean_names()|>
    mutate(filetype = filetype, year = year)
    
}


demo_list_output = 
  map(demo_list, load_files) 

hiv_list_output = 
  map(hiv_list, load_files) 

sxq_list_output = 
  map(sxq_list, load_files)

join_pair = function(df1, df2) {
  left_join(df1, df2, by = c("seqn", "year"))
}


hiv_demo_list = 
  map2(hiv_list_output, demo_list_output, ~join_pair(.x, .y))
  
hiv_demo_sxq_df = 
  map2(hiv_demo_list, sxq_list_output, ~join_pair(.x, .y)) |>
  bind_rows() 

Re-coded variables of interest: Restricted particpants’s age between 20 and 49

cleaned_joined_df = 
  hiv_demo_sxq_df |>
  filter(ridageyr >= 20 & ridageyr <= 49) |>
  mutate(hiv = case_when(
    lbdhi == 1 | lbxhivc == 1 ~ 1,
    lbdhi == 2 | lbxhivc == 2 ~ 2)
    )|> 
  mutate(MSM = case_when(
    year %in% c("A","B","C") ~ sxq220,
    year %in% c("D","E","F","G","H","I") ~ sxq550)
    )|>
   mutate(WSW = case_when(
    year %in% c("A","B","C") ~ sxq150,
    year %in% c("D","E","F","G","H","I") ~ sxq490)
    )|>
  mutate(year = recode(year, 
                       "A" = "1999-2000",
                       "B" = "2001-2002",
                       "C" = "2003-2004",
                       "D" = "2005-2006",
                       "E" = "2007-2008",
                       "F" = "2009-2010",
                       "G" = "2011-2012",
                       "H" = "2013-2014",
                       "I" = "2015-2016"),
         hiv = recode(hiv,
                            "1" = "Reactive",
                            "2" = "Non-reactive"),
         gender = recode(riagendr, 
                         "1" = "Male",
                         "2" = "Female"),
         age = ridageyr,
         race = recode(ridreth1,
                       "1" = "Mexican American",
                       "2" = "Other Hispanic",
                       "3" = "Non-Hispanic White",
                       "4" = "Non-Hispanic Black",
                       "5" = "Other Race - Including Multi-Racial"),
         education = recode(dmdeduc2,
                            "1" = "Less than 9th grade",
                            "2" = "9-11th grade",
                            "3" = "High school graduate or equivalent",
                            "4" = "Some college or AA degree",
                            "5" = "College graduate or above",
                            "7" = NA_character_,
                            "9" = NA_character_),
         marriage = recode(dmdmartl,
                           "1" = "Married", 
                           "2" = "Widowed",
                           "3" = "Divorced",
                           "4" = "Separated",
                           "5" = "Never married",
                           "6" = "Living with partner",
                           "77" = NA_character_,
                           "99" = NA_character_)
         ) |>
  mutate(samesexcontact = case_when(
    (MSM %in% c(0, 7777, 77777,9999, 99999) | WSW %in% c(0, 7777, 77777, 9999, 99999)) ~ 0,
    (MSM >= 1 & MSM <= 600) | (WSW >= 1 & WSW <= 600) ~ 1
  )) |>
  select(seqn, hiv, age, gender, race, education, marriage, year, samesexcontact)
cleaned_joined_df |>
  group_by(samesexcontact, hiv) |>
  summarize(n = n())
## `summarise()` has grouped output by 'samesexcontact'. You can override using
## the `.groups` argument.
## # A tibble: 9 × 3
## # Groups:   samesexcontact [3]
##   samesexcontact hiv              n
##            <dbl> <chr>        <int>
## 1              0 Non-reactive  6826
## 2              0 Reactive        33
## 3              0 <NA>           137
## 4              1 Non-reactive   572
## 5              1 Reactive        48
## 6              1 <NA>            15
## 7             NA Non-reactive 15589
## 8             NA Reactive        43
## 9             NA <NA>          1136

Visualization - 1 Total number as y-axis value Across race

cleaned_joined_df %>% 
  filter(hiv == "Reactive") %>% 
  group_by(race, gender) %>%
  summarize(n = n(), age_mean = round(mean(age), 2)) %>% 
  ungroup() %>% 
  mutate(race = fct_reorder(race, n)) %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~race, y = ~n, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Race"),
      yaxis = list(title = "Total Number of HIV Positive Patients"),
      title = "The Total Number of HIV Positive Patients from 1999 to 2016 Across Races"
    )
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.

Visualization - 1-1 Total percent as y-axis value

cleaned_joined_df %>% 
   mutate(hiv = ifelse(is.na(hiv), "NA", hiv)
    ) %>% 
  group_by(race, gender) %>%
  summarize(total_par = n(), age_mean = round(mean(age), 2), 
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
  ungroup() %>% 
  mutate(race = fct_reorder(race, hiv_percent)) %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~race, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Race"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Races"
    )
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.

Visualization 2: by year

cleaned_joined_df %>% 
  mutate(hiv = ifelse(is.na(hiv), "NA", hiv)) %>% 
  group_by(year, gender) %>% 
  summarize(age_mean = round(mean(age), 2), 
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
  ungroup() %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~year, y = ~hiv_percent, color = ~gender, type = "scatter", mode = "lines+markers",
          text = ~text_label, colors = "viridis") %>% 
  layout(
      xaxis = list(title = "Year"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016"
    )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

Visualization 3: Across Education level

cleaned_joined_df %>% 
   mutate(hiv = ifelse(is.na(hiv), "NA", hiv)
    ) %>% 
  group_by(education, gender) %>%
  summarize(total_par = n(), age_mean = round(mean(age), 2), 
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
  ungroup() %>% 
  mutate(education = forcats::fct_relevel(education, c("Less than 9th grade", "9-11th grade", 
                                         "High school graduate or equivalent", 
                                         "Some college or AA degree",
                                         "College graduate or above"))) %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~education, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Education level"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Education Level"
    )
## `summarise()` has grouped output by 'education'. You can override using the
## `.groups` argument.
## Warning: Ignoring 2 observations

Visualization 4: Across marital status

cleaned_joined_df %>% 
   mutate(hiv = ifelse(is.na(hiv), "NA", hiv)) %>% 
  group_by(marriage, gender) %>%
  summarize(total_par = n(), age_mean = round(mean(age), 2), 
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
  ungroup() %>% 
  mutate(marriage = fct_reorder(marriage, hiv_percent)) %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~marriage, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Marital Status"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Marital Status"
    )
## `summarise()` has grouped output by 'marriage'. You can override using the
## `.groups` argument.
## Warning: Ignoring 2 observations

Visualization 5: Sexual behavior

cleaned_joined_df %>% 
   mutate(hiv = ifelse(is.na(hiv), "NA", hiv),
          samesexcontact = recode(samesexcontact, "0" = "No", 
                                  "1" = "Yes")) %>% 
  group_by(samesexcontact, gender) %>% 
  summarize(total_par = n(), age_mean = round(mean(age), 2), hiv_num = sum(hiv == "Reactive"),
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>% 
  ungroup() %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~samesexcontact, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Same Sex Sexual Behavior"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Sexual Behavior"
    )
## `summarise()` has grouped output by 'samesexcontact'. You can override using
## the `.groups` argument.
## Warning: Ignoring 2 observations